/*******************************************************************************
* Objective: Create "dataset_maps_and_cities_classwk.dta"
*******************************************************************************/

version 16

* Stablish Working Directory ***************************************************
cd "$workdirectory"

* Settings *********************************************************************
capture log close
clear all
set more off

********************************************************************************
**# Step 1: Creating share of subdistricts located within a FUA
********************************************************************************

* Importing intersection of FUAs and subdistricts
import delimited raw_datasets\Maps\intersection.csv, encoding(UTF-8) clear
save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

* Importing area of subdistricts
* We drop observations without geolevel2 and also observations with geolevel == 888888888, as this seems to identify bodies of water and is not a unique identifier in the sample of subdistricts at world level (it should not contain information on labor shares neither)

import delimited raw_datasets\Maps\area_subdist.csv, encoding(UTF-8) clear 
keep geolevel2 area_subd
drop if geolevel2 == .
drop if geolevel2 == 888888888
tempfile area_subdist
save `area_subdist', replace

* Merging and obtaining shares
use "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", clear
merge m:1 geolevel2 using `area_subdist'
keep if _merge == 3
drop _merge

gen share = area_calcu/area_subd
sort geolevel2
save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

********************************************************************************
**# Step 2: Generate class of work per city data
********************************************************************************

* Generating empty file to attach data to
clear all
tempfile data_classwk_cities
save `data_classwk_cities', empty replace

* Genearting local with all .csv folder
local files : dir "raw_datasets\Maps\classwk_maps" files "*.csv"

cd raw_datasets\Maps\classwk_maps

* Importing, appending and saving
foreach fi in `files' {
  import delimited `fi', varnames(12) rowrange(13) encoding(UTF-8) clear
  gen country_year = subinstr("`fi'","_classwk.csv","",.)
  append using `data_classwk_cities'
  save `data_classwk_cities', replace
}

* Cleaning
keep if v1 == "Weighted N"
drop v1
rename v2 census
drop if census == "COL TOTAL"

* COUNTRIES WITH MORE THAN 1 COUNTRY_YEAR PER YEAR
foreach letter in a b c d e f{
	replace country_year = "brasil_1980" if country_year == "brasil`letter'_1980"
	replace country_year = "brasil_1991" if country_year == "brasil`letter'_1991"
	replace country_year = "brasil_2000" if country_year == "brasil`letter'_2000"
	replace country_year = "brasil_2010" if country_year == "brasil`letter'_2010"
}

* Exctracting geolevel2
gen geolevel2 = regexs(0) if(regexm(census, "[0-9]+"))
destring geolevel2, replace

save `data_classwk_cities', replace

*-------------------------------------------------------------------------------
* Adding Mexico

* Importing
use ipums_mexico_classwk, clear

* Cleaning and collapsing by year, municipality and industry
drop country sample serial hhwt urban pernum age classwkd
sort year geolev2
collapse (sum) perwt, by(year geolev2 classwk)

* Reshaping to fit main dataset format
reshape wide perwt, i(year geolev2) j(classwk)

* Generating variable country_year
gen country_year = ""
foreach y of numlist 1960 1970 1990 1995 2000 2010 2015{
	replace country_year = "mexico_`y'" if year == `y'
}

* Cleaning to fit main dataset
drop year
rename geolev2 geolevel2

* Replacing missings with 0s
foreach v of varlist perwt*{
	replace `v' = 0 if `v' == .
}

* Renaming to match with main dataset
rename perwt0		niunotinuniverse
rename perwt1		selfemployed
rename perwt2		wagesalaryworker
rename perwt3		unpaidworker
*rename perwt4		other
rename perwt9		unknownmissing

* Generating rowtotal variable
egen rowtotal = rowtotal(niunotinuniverse selfemployed wagesalaryworker unpaidworker unknownmissing)

* Appending to main dataset
append using `data_classwk_cities'

* Saving
save `data_classwk_cities', replace

*-------------------------------------------------------------------------------
* Adding Turkey

use ipums_turkey_classwk, clear

* Keeping only urban areas
egen urban = rowtotal(tr1985a_urban tr1990a_urban tr2000a_urban)
keep if urban == 1
drop urban tr1985a_urban tr1990a_urban tr2000a_urban

* Cleaning and collapsing by year, municipality and industry
drop country sample serial hhwt pernum age classwkd
sort year geolev2
collapse (sum) perwt, by(year geolev2 classwk)

* Reshaping to fit main dataset format
reshape wide perwt, i(year geolev2) j(classwk)

* Generating variable country_year
gen country_year = ""
foreach y of numlist 1985 1990 2000{
	replace country_year = "turkey_`y'" if year == `y'
}

* Cleaning to fit main dataset
drop year
rename geolev geolevel2

* Replacing missings with 0s
foreach v of varlist perwt*{
	replace `v' = 0 if `v' == .
}

* Renaming to match with main dataset
rename perwt0		niunotinuniverse
rename perwt1		selfemployed
rename perwt2		wagesalaryworker
rename perwt3		unpaidworker
*rename perwt4		other
rename perwt9		unknownmissing

* Generating rowtotal variable
egen rowtotal = rowtotal(niunotinuniverse selfemployed wagesalaryworker unpaidworker unknownmissing)

* Appending to main dataset
append using `data_classwk_cities'

* Saving
save `data_classwk_cities', replace

*-------------------------------------------------------------------------------
* Adding Spain

use ipums_spain_classwk, clear

* Keeping only urban population -  Based on World Urbanization Prospects The 2011 Revision, municipalities with over 10,000 people are consider urban
keep if es1991a_munsize > 08
drop es1991a_munsize

* Cleaning and collapsing by year, municipality and industry
drop country sample serial hhwt pernum age classwkd
sort year geolev2
collapse (sum) perwt, by(year geolev2 classwk)

* Reshaping to fit main dataset format
reshape wide perwt, i(year geolev2) j(classwk)

* Generating variable country_year
gen country_year = "spain_1991"

* Cleaning to fit main dataset
drop year
rename geolev geolevel2

* Replacing missings with 0s
foreach v of varlist perwt*{
	replace `v' = 0 if `v' == .
}

* Renaming to match with main dataset
rename perwt0		niunotinuniverse
rename perwt1		selfemployed
rename perwt2		wagesalaryworker
rename perwt3		unpaidworker
rename perwt4		other
*rename perwt9		unknownmissing

* Generating rowtotal variable
egen rowtotal = rowtotal(niunotinuniverse selfemployed wagesalaryworker unpaidworker other)

* Appending to main dataset
append using `data_classwk_cities'

* Saving
sort geolevel2
save `data_classwk_cities', replace

********************************************************************************

cd ..\..\..

********************************************************************************
**# Step 3: Assigning Employment by class of work to each City
********************************************************************************

* Joining both datasets
use "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", clear
joinby geolevel2 using `data_classwk_cities'

* Applying Shares
foreach v of varlist niunotinuniverse selfemployed wagesalaryworker unpaidworker other rowtotal unknownmissing{
	gen _`v' = `v'*share
}

* Fixing country names from both datasets
replace cntry_na_1 = "Burkina Faso" 		if cntry_na_1 == "BurkinaFaso"
replace cntry_na_1 = "Costa Rica" 			if cntry_na_1 == "CostaRica"
replace cntry_na_1 = "Dominican Republic" 	if cntry_na_1 == "DominicanRepublic"
replace cntry_na_1 = "El Salvador" 			if cntry_na_1 == "ElSalvador"
replace cntry_na_1 = "Kyrghzstan" 			if cntry_na_1 == "Kyrgyzstan"
replace cntry_na_1 = "Papua New Guinea" 	if cntry_na_1 == "PapuaNewGuinea"

* Drop intersections which associate subdistricts from country i to a city located in country j
drop if cntry_name != cntry_na_1

* Collapsing by city and census
collapse (sum) _* (mean) fua_p_2015 (first) efua_name ,by(efua_id country_year)

foreach v2 of varlist _selfemployed _wagesalaryworker _unpaidworker _other _unknownmissing {
	gen share`v2' = `v2'/(_rowtotal-_niunotinuniverse)
}

save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

********************************************************************************
**# Step 4: Adding ADM1, ADM3 and other countries datasets
********************************************************************************

* ADM1
do "codes\secondary_dos\classwk_maps_and_cities_adm1_data_creator.do"
use "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", clear
append using "processed_datasets\dataset_maps_and_cities_adm1_classwk"
save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

* CH

* ADM3
do "codes\secondary_dos\classwk_maps_and_cities_adm3_data_creator.do"
use "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", clear
append using "processed_datasets\dataset_maps_and_cities_adm3_classwk"
save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

* CH

* Canada, Phillipines, US and Italy
do "codes\secondary_dos\classwk_maps_and_cities_indcountries_data_creator.do"
use "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", clear
append using "processed_datasets\maps_and_cities_canada_classwk"
append using "processed_datasets\maps_and_cities_philippines_classwk"
append using "processed_datasets\maps_and_cities_usa_classwk"
append using "processed_datasets\maps_and_cities_italy_classwk"
save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

********************************************************************************
**# Step 4: Obtaining Urban Center populations
********************************************************************************

* Importing areas of UCs
import delimited raw_datasets\Maps\area_uc.csv, encoding(UTF-8) clear
keep id_hdc_g0 area_uc
tempfile uc_pop
save `uc_pop'

* Importing intersection of UCs and FUAs
import delimited raw_datasets\Maps\intersection_fua_uc.csv, encoding(UTF-8) clear
keep efua_id uc_ids efua_name cntry_name id_hdc_g0 ctr_mn_nm uc_nm_mn p75 p90 p00 p15 area_inter uc_p_2015

* Merging with UC area file
merge m:1 id_hdc_g0 using `uc_pop'
keep if _merge == 3
drop _merge

* Calculating shares of area of intersection UC-FUA on UC area
gen share_uc = area_inter/area_uc

sort efua_id id_hdc_g0

* Generating adjusted population based on new shares
gen p75_adj = p75 * share_uc
gen p90_adj = p90 * share_uc
gen p00_adj = p00 * share_uc
gen p15_adj = p15 * share_uc

tempfile fua_uc_pop
save `fua_uc_pop', replace

* Calculating FUA-UC population option A: Using only largest UC within FUA
use `fua_uc_pop', clear

gsort efua_id -p15_adj
by efua_id: egen double maximum = max(p15_adj)
keep if p15_adj == maximum
keep efua_id p75_adj p90_adj p00_adj p15_adj

rename p75_adj p75_adj_max
rename p90_adj p90_adj_max
rename p00_adj p00_adj_max
rename p15_adj p15_adj_max

la var p75_adj_max "UC pop 75 using max. UC within FUA"
la var p90_adj_max "UC pop 90 using max. UC within FUA"
la var p00_adj_max "UC pop 00 using max. UC within FUA"
la var p15_adj_max "UC pop 15 using max. UC within FUA"

tempfile fua_uc_pop_max
save `fua_uc_pop_max', replace

* Calculating FUA-UC population option B: Using all UCs withing FUA
use `fua_uc_pop', clear

collapse (sum) p75_adj p90_adj p00_adj p15_adj, by(efua_id)

rename p75_adj p75_adj_wgt
rename p90_adj p90_adj_wgt
rename p00_adj p00_adj_wgt
rename p15_adj p15_adj_wgt

la var p75_adj_wgt "UC pop 75 using weighted pop. of all UCs within FUA"
la var p90_adj_wgt "UC pop 90 using weighted pop. of all UCs within FUA"
la var p00_adj_wgt "UC pop 00 using weighted pop. of all UCs within FUA"
la var p15_adj_wgt "UC pop 15 using weighted pop. of all UCs within FUA"

tempfile fua_uc_pop_wgt
save `fua_uc_pop_wgt', replace

* Calculating FUA-UC population option C: Using 2015 UC population measures in UC and FUA datasets
use `fua_uc_pop', clear

drop area_inter area_uc share_uc p75_adj p90_adj p00_adj p15_adj
collapse (sum) p75 p90 p00 p15 (mean) uc_p_2015, by(efua_id)
gen share = uc_p_2015/p15

gen p75_adj_sh = p75*share
gen p90_adj_sh = p90*share
gen p00_adj_sh = p00*share
gen p15_adj_sh = p15*share

la var p75_adj_sh "UC pop 75 using FUA and UC pop. measures for 2015 as shares"
la var p90_adj_sh "UC pop 90 using FUA and UC pop. measures for 2015 as shares"
la var p00_adj_sh "UC pop 00 using FUA and UC pop. measures for 2015 as shares"
la var p15_adj_sh "UC pop 15 using FUA and UC pop. measures for 2015 as shares"

keep efua_id p75_adj_sh p90_adj_sh p00_adj_sh p15_adj_sh

tempfile fua_uc_pop_sh
save `fua_uc_pop_sh', replace

* Merging to main dataset

use "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", clear
merge m:1 efua_id using `fua_uc_pop_max'
keep if _merge == 3
drop _merge

merge m:1 efua_id using `fua_uc_pop_wgt'
keep if _merge == 3
drop _merge

merge m:1 efua_id using `fua_uc_pop_sh'
keep if _merge == 3
drop _merge

save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

********************************************************************************
**# Step 5: Merging with city coordinates
********************************************************************************
import delimited raw_datasets\Maps\cities_coordinates.csv, encoding(UTF-8) clear
keep x y efua_id cntry_name
tempfile cities_coordinates
save `cities_coordinates', replace

use "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", clear
merge m:1 efua_id using `cities_coordinates'
keep if _merge == 3
drop _merge

save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

********************************************************************************
**# Step 6: Identify capital cities
********************************************************************************

* 1 : Main Capital
* 2 : Alternative Capital

gen capital = 0
replace capital = 1 if efua_id == 4215
replace capital = 1 if efua_id == 1992
replace capital = 2 if efua_id == 1657
replace capital = 1 if efua_id == 209
replace capital = 2 if efua_id == 1165
replace capital = 1 if efua_id == 5465
replace capital = 1 if efua_id == 976
replace capital = 1 if efua_id == 2295
replace capital = 1 if efua_id == 2072
replace capital = 1 if efua_id == 7704
replace capital = 1 if efua_id == 4048
replace capital = 1 if efua_id == 1674
replace capital = 1 if efua_id == 2569
replace capital = 1 if efua_id == 3981
replace capital = 1 if efua_id == 888
replace capital = 1 if efua_id == 5821
replace capital = 1 if efua_id == 2940
replace capital = 1 if efua_id == 2162
replace capital = 1 if efua_id == 550
replace capital = 1 if efua_id == 1302
replace capital = 1 if efua_id == 1301
replace capital = 1 if efua_id == 7466
replace capital = 1 if efua_id == 4897
replace capital = 1 if efua_id == 4686
replace capital = 1 if efua_id == 2709
replace capital = 1 if efua_id == 1507
replace capital = 1 if efua_id == 567
replace capital = 1 if efua_id == 98
replace capital = 1 if efua_id == 446
replace capital = 1 if efua_id == 1322
replace capital = 1 if efua_id == 586
replace capital = 1 if efua_id == 5006
replace capital = 1 if efua_id == 118
replace capital = 1 if efua_id == 1525
replace capital = 1 if efua_id == 594
replace capital = 1 if efua_id == 6502
replace capital = 1 if efua_id == 5621
replace capital = 1 if efua_id == 740
replace capital = 1 if efua_id == 875
replace capital = 1 if efua_id == 465
replace capital = 1 if efua_id == 2327
replace capital = 1 if efua_id == 1338
replace capital = 1 if efua_id == 156
replace capital = 1 if efua_id == 1890
replace capital = 1 if efua_id == 3226
replace capital = 1 if efua_id == 2797
replace capital = 1 if efua_id == 1894
replace capital = 1 if efua_id == 1245
replace capital = 1 if efua_id == 906
replace capital = 1 if efua_id == 7068
replace capital = 1 if efua_id == 4040
replace capital = 1 if efua_id == 1986
replace capital = 1 if efua_id == 2702
replace capital = 1 if efua_id == 2114
replace capital = 1 if efua_id == 799
replace capital = 1 if efua_id == 666
replace capital = 1 if efua_id == 566
replace capital = 1 if efua_id == 310
replace capital = 1 if efua_id == 3700
replace capital = 2 if efua_id == 189
replace capital = 2 if efua_id == 2060
replace capital = 1 if efua_id == 3596
replace capital = 1 if efua_id == 153
replace capital = 1 if efua_id == 3343
replace capital = 1 if efua_id == 3650
replace capital = 1 if efua_id == 3147
replace capital = 1 if efua_id == 2207
replace capital = 1 if efua_id == 2845
replace capital = 1 if efua_id == 1899
replace capital = 1 if efua_id == 3222
replace capital = 1 if efua_id == 2317
replace capital = 1 if efua_id == 57
replace capital = 1 if efua_id == 273
replace capital = 1 if efua_id == 164
replace capital = 1 if efua_id == 224
replace capital = 1 if efua_id == 121
replace capital = 1 if efua_id == 359
replace capital = 1 if efua_id == 3881
replace capital = 1 if efua_id == 101
replace capital = 1 if efua_id == 1091
replace capital = 1 if efua_id == 415
replace capital = 1 if efua_id == 3705
replace capital = 1 if efua_id == 1834
replace capital = 1 if efua_id == 2262

********************************************************************************
**# Step 7: Estimate and rename variables 
********************************************************************************

* Log of Population
gen log_fua_pop_2015 = log(fua_p_2015)
rename fua_p_2015 fua_pop_2015

* We rename the wagesalary worker var name  to match the informality dataset
rename _wagesalaryworker		_wage_worker
rename share_wagesalaryworker 	share_wage_worker

* Year
gen year = regexs(0) if(regexm(country_year, "[0-9][0-9][0-9][0-9]"))
destring year, replace

* Country
gen country = regexs(0) if(regexm(country_year, "([a-zA-Z]+)"))
replace country = proper(country)

save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

********************************************************************************
**# Step 7: Matching missing census categories
/*******************************************************************************
In some censuses a particular industry may not be present in the data. 
When appending all country-year data files, some categories will appear as 
missing for some countries if the category was not available for their 
census-year. When collapsing the dataset, all these missing values disappear 
and are replaced by 0s. It's not clear then, if a 0 is indeed a 0 or a 
hidden missing. We create a set of dummies for each industry category that take 
a value 1 if the industry was a valid option in each country-year census. 
These variables can be used as an aid to determine whether 0s represent 
real 0s or `hidden missings'.
*******************************************************************************/

* List of countries names as downloaded from IPUMS data extract matched to
* list of names used in the current dataset
import excel raw_datasets\Maps\IPUMS_cats_census.xlsx, sheet("countries_list") firstrow clear
tempfile fixname
save `fixname', replace

* List of missing categories from the census
import excel raw_datasets\Maps\IPUMS_cats_census.xlsx, sheet("CLASSWK") firstrow clear

* Matching countries names to the ones used in the current dataset
merge m:1 country_ipums using `fixname'
keep if _merge == 3
drop _merge

* Fixing formating of years (in order to destring the variable)
drop if country == "Italy" & year == "2011Q1"
drop if country == "Spain" & year != "1991"
drop if country == "Unitedkingdom"

foreach y of numlist 2012/2019{
	replace year = "`y'" if year == "`y'Q1" & country == "Italy"
}

destring year, replace

* Generating variables to match informality dataset

gen 	q_selfemployed_employer = "·"
replace q_selfemployed_employer = "X" if Employer == "X" | Sharecropperemployer == "X"

gen		q_selfemployed_ownaccount = "·" 
replace	q_selfemployed_ownaccount = "X" if Workingonownaccount == "X" | Ownaccountagriculture == "X" | Domesticworkerselfemployed == "X" | Subsistenceworkerownconsumpt == "X" | Ownaccountother == "X" | Ownaccountwithouttemporaryu == "X" | Ownaccountwithtemporaryunpa == "X"

gen		q_selfemployed_sharecropper = "·"
replace	q_selfemployed_sharecropper = "X" if Sharecropper == "X" | Sharecropperselfemployed == "X" | Sharecropperemployee == "X"

* We drop Selfemployednotspecified as it's only available for Russia and
* we don't need it for the maps_and_cities_classwk dataset
drop Selfemployednotspecified

* We drop the variables we have used above
drop Employer Sharecropperemployer Workingonownaccount Ownaccountagriculture Domesticworkerselfemployed Subsistenceworkerownconsumpt Ownaccountother Ownaccountwithouttemporaryu Ownaccountwithtemporaryunpa Sharecropper Sharecropperselfemployed Sharecropperemployee

* Renaming variables
rename NIUnotinuniverse 				q_niu
rename Selfemployed						q_selfemployed
rename Wagesalaryworker					q_wage_worker			
rename Unpaidworker						q_unpaidworker
rename Other							q_other
rename Unknownmissing					q_unknownmissing
rename SELFEMPLOYED						q_selfemployed_unspecified
rename Selfemployedunincorporated		q_selfemployed_uninincorporated
rename Selfemployedincorporated			q_selfemployed_incorporated
rename Memberofcooperative				q_selfemployed_cooperative
rename Kibbutzmember					q_kibbutzmember

* Replacing Xs for 1s
foreach v of varlist q_*{
	replace `v' = "1" if `v' == "X"
	replace `v' = "0" if `v' == "·"
	destring `v', replace
}

* Saving
drop country_ipums
save `fixname', replace

*-------------------------------------------------------------------------------
* Mathcing to main dataset

use "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", clear
merge m:1 country year using `fixname'
keep if _merge == 3
drop _merge

* Dropping irrelevant variables
drop q_selfemployed_unspecified q_selfemployed_uninincorporated q_selfemployed_incorporated q_selfemployed_cooperative q_kibbutzmember q_selfemployed_employer q_selfemployed_ownaccount q_selfemployed_sharecropper

********************************************************************************
**# Step 7: Labeling and Cleaning
********************************************************************************

drop cntry_name
drop q_niu

la var efua_id 					"ID of FUA"
la var country_year				"country and year of census"

la var _niunotinuniverse 		"(Weighted) number of people not in universe" 
la var _selfemployed 			"(Weighted) number of people self employed"
la var _wage_worker 			"(Weighted) number of people wage/salary worker"
la var _unpaidworker 			"(Weighted) number of people unpaid worker"
la var _other 					"(Weighted) number of people other class of work"
la var _rowtotal 				"(Weighted) number of people (Total inc. not in univ.)"
la var _unknownmissing			"(Weighted) number of people unknown/missing class of work"

la var fua_pop_2015				"FUA population in 2015"

la var share_selfemployed 		"Share of people self employed"
la var share_wage_worker 		"Share of people wage/salary worker"
la var share_unpaidworker 		"Share of people unpaid worker"
la var share_other 				"Share of people other class of work"
la var share_unknownmissing		"Share of people unknown/missing class of work"

la var x						"X coordinate of FUA"
la var y						"Y coordinate of FUA"

la var capital					"1 if FUA is main capital, 2 if it's second/alt capital"
la var log_fua_pop_2015			"Log of FUA population in 2015"

foreach v in selfemployed wage_worker unpaidworker other unknownmissing{
	la var q_`v'				"Value of 1 if census of country-year asks about `v'"
}

********************************************************************************
**# Step 8: Saving
********************************************************************************

sort efua_id country_year
save "processed_datasets\dataset_maps_and_cities_classwk_01.22.2022.dta", replace

